Downloading the dataset

Exercise 1

  # Source URL of the zip archive (allsites.zip) that the later steps download.
  allsites_zip_url <- "http://academic.udayton.edu/kissock/http/Weather/gsod95-current/allsites.zip"

Exercise 2

  # Directory, given as a relative path, that holds the data files:
  data_dir <- path("datas")

Exercise 3

  # Full path of the zip archive inside the data directory.
  allsites_zip_path <- path(data_dir, "allsites", ext = "zip")
  allsites_zip_path
## datas/allsites.zip

Exercise 4

# Download the archive only when it is not already on disk, so repeated runs
# skip the network transfer.
if (!file_exists(allsites_zip_path)) {
  # The destination's parent directory must exist before the file can be
  # written; dir_create() leaves an already existing directory untouched.
  dir_create(path_dir(allsites_zip_path))
  allsites_zip_url %>%
    curl_download(destfile = allsites_zip_path)
}

Exercise 5

# Does the archive exist on disk? The result is named by the path checked.
file_exists(allsites_zip_path)
## datas/allsites.zip 
##               TRUE
# Negated form, as used in the condition that guards the download: TRUE only
# while the archive is still missing.
!file_exists(allsites_zip_path)
## datas/allsites.zip 
##              FALSE

Basically, if the file already exists at the download path, we will not download the file again. If it does not exist, we will download the file.

Unzipping the data files

Exercise 6

# Extract the contents of the archive into the data directory.
unzip(allsites_zip_path, exdir = data_dir)

It unzipped the zip file into multiple different text files.

Extracting information from a file name

Exercise 7

# Path of one extracted text file, used as the worked example below.
data_file <- path(data_dir, "ALHUNTSV", ext = "txt")

Exercise 8

# Strip the directory first, then the extension, leaving the bare file name.
data_file_name <- path_ext_remove(path_file(data_file))

Exercise 9

# stringr positions are 1-based: characters 1-2 of the file name are taken as
# the state code and everything from character 3 onward as the city name.
file_state <- data_file_name %>%
  str_sub(start = 1, end = 2)
file_city <- data_file_name %>%
  str_sub(start = 3)
file_state
## [1] "AL"
file_city
## [1] "HUNTSV"

Read a data file, label the dataset

Exercise 10

# Read the example file with explicit column names, then label every row with
# the state and city taken from the file name and build a date from the
# Year/Month/Day columns.
alhuntsv <- mutate(
  read_table(data_file, col_names = c("Month", "Day", "Year", "temp")),
  state = file_state,
  city = file_city,
  date = make_date(year = Year, month = Month, day = Day)
)

User-defined functions

Exercise 11

# Print the sum of two numbers as a sentence.
# The last expression is the cat() call, so the function's value is what
# cat() returns (NULL), not the sum itself.
add <- function(number1, number2) {
  total <- number1 + number2
  cat(number1, "plus", number2, "equals", total, "\n")
}
add(4, 6)
## 4 plus 6 equals 10
add(50, 500)
## 50 plus 500 equals 550

Exercise 12

# Print the sum of two numbers as a sentence and also return the sum, so the
# caller can store it.
add_fix <- function(number1, number2) {
  total <- number1 + number2
  cat(number1, "plus", number2, "equals", total, "\n")
  total
}
add_result <- add_fix(4, 6)
## 4 plus 6 equals 10

Exercise 13

# Draw a density plot of one column of the `mpg` data set.
# `variable` is given as a bare column name; it is captured unevaluated and
# spliced into the x aesthetic.
mpg_density_plot <- function(variable) {
  density_var <- rlang::enquo(variable)
  density_layer <- geom_density(mapping = aes(x = !!density_var))
  ggplot(data = mpg) +
    density_layer
}
mpg_density_plot(hwy)

mpg_density_plot(cty)

mpg_density_plot(displ)

Function to read a data file

Exercise 14

#' Read one temperature file and label it with its location
#'
#' The file name (without directory or extension) is split into a two-letter
#' state/country code (characters 1-2) and a city name (character 3 onward).
#'
#' @param data_file Path to a text file that is read with `read_table()` as
#'   four columns: Month, Day, Year, temperature.
#' @return The data frame produced by `read_table()` with added `state`,
#'   `city` and `date` columns.
read_data_file <- function(data_file) {
  file_name <- data_file %>%
    path_file() %>%
    path_ext_remove()

  # stringr positions are 1-based, so the code is characters 1 through 2.
  file_state <- str_sub(file_name, start = 1, end = 2)
  file_city <- str_sub(file_name, start = 3)

  col_names <- c("Month", "Day", "Year", "temperature")

  data_file %>%
    read_table(col_names = col_names) %>%
    mutate(
      state = file_state,
      city = file_city,
      date = make_date(year = Year, month = Month, day = Day)
    )
}
alhuntsv2 <- read_data_file(data_file)

Read all the data files

Exercise 15

# Every .txt file in the data directory.
data_files <- dir_ls(data_dir, glob = "*.txt")

# Read each file and stack the results into one data frame, then turn
# readings of -99 into NA (presumably a placeholder for missing data; confirm
# against the data set's documentation).
temperature_df <- mutate(
  map_dfr(data_files, read_data_file),
  temperature = if_else(near(temperature, -99), NA_real_, temperature)
)

It's to replace the -99 placeholder values with NA, removing outliers or typos that could skew the database heavily.

Explore your new temperature database

Exercise 16

  # Keep only rows with a recorded temperature, and leave out the year 2018.
  temperature_df_filtered <- filter(
    temperature_df,
    !is.na(temperature),
    Year != 2018
  )

Exercise 17

  # Rows whose city code is WASHDC.
  washdc <- filter(temperature_df_filtered, city == "WASHDC")

Exercise 18

  # Scatter plot of every WASHDC reading against its date.
  ggplot(data = washdc, mapping = aes(x = date, y = temperature)) +
    geom_point()

The constant oscillation is due to the change in the seasons: it becomes colder in winter and warmer in summer.

Exercise 19

# Group the WASHDC rows by year. group_by() only marks the groups; it does
# not aggregate anything on its own.
washdc_year <- washdc %>%
  group_by(Year)

# Collapse each year to its mean temperature so the plots show the yearly
# average instead of every daily reading stacked at its year.
washdc_year_mean <- washdc_year %>%
  summarise(temperature = mean(temperature, na.rm = TRUE))

# Yearly mean temperature over time, with a smoothed trend line.
washdc_year_mean %>%
  ggplot(mapping = aes(x = Year, y = temperature)) +
    geom_point() +
    geom_smooth()

# The same yearly means with the axes swapped.
washdc_year_mean %>%
  ggplot(mapping = aes(x = temperature, y = Year)) +
    geom_point() +
    geom_smooth()

The average temperature per year has risen by about 2.5-5 degrees since the first collection in 1995.

Exercise 20

# All filtered readings, grouped by year; the grouping only takes effect in a
# later aggregating step.
world_year <- group_by(temperature_df_filtered, Year)